In [ ]:
#Importing required libraries
In [ ]:
import pandas as pd
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
import datetime, warnings, scipy

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
# We have identified the small airlines based on the cutoff <20000 number of flights and the result is 10 airlines
#Frontier, GoJet,Alaska,AirWisconsin,Commute,Hawaiian,Horizon Air,Mesa,Piedmont,Spirit Airlines
#We created a database table with all the data and used a view which contains below columns. For ease of our analysis we created separate csv files for each small airline and loaded them to the dataframe
In [ ]:
#Loading the dataframes for each small airline as listed above
In [ ]:
frontier=pd.read_csv('/content/drive/MyDrive/Data/F9.csv')
In [ ]:
gojet=pd.read_csv('/content/drive/MyDrive/Data/G7.csv')
In [ ]:
alaska=pd.read_csv('/content/drive/MyDrive/Data/AS.csv')
In [ ]:
airwis=pd.read_csv('/content/drive/MyDrive/Data/ZW.csv')
In [ ]:
commute=pd.read_csv('/content/drive/MyDrive/Data/C5.csv')
In [ ]:
hawa=pd.read_csv('/content/drive/MyDrive/Data/HA.csv')
In [ ]:
pied=pd.read_csv('/content/drive/MyDrive/Data/PT.csv')
In [ ]:
spirit=pd.read_csv('/content/drive/MyDrive/Data/NK.csv')
In [ ]:
horizon=pd.read_csv('/content/drive/MyDrive/Data/QX.csv')
In [ ]:
mesa=pd.read_csv('/content/drive/MyDrive/Data/YV.csv')
In [ ]:
#Now that we have the data for each airline we will be merging the dataframes to perform EDA
#Note:- We again break this merged data frame into precovid and postcovid as it is easier to handle data that way due to low computational power of our machines
In [ ]:
df=[frontier,gojet,alaska,airwis,commute,hawa,pied,spirit,horizon,mesa]
In [ ]:
small_airlines = pd.concat(df, ignore_index=True, sort=False)
small_airlines
Out[ ]:
year month flightdate DayOfWeek DayofMonth Operating_Airline OriginCityName OriginStateName DestCityName DestStateName ... SecurityDelay LateAircraftDelay TaxiIn TaxiOut WheelsOff WheelsOn Origin Dest Tail_Number Description
0 2022 2 2022-02-01 2 1 F9 Las Vegas, NV Nevada San Diego, CA California ... NaN NaN 5.0 15.0 1807 1857 LAS SAN N206FR Frontier Airlines Inc.
1 2022 2 2022-02-01 2 1 F9 San Diego, CA California Las Vegas, NV Nevada ... NaN NaN 11.0 18.0 2009 2057 SAN LAS N206FR Frontier Airlines Inc.
2 2022 2 2022-02-01 2 1 F9 Las Vegas, NV Nevada Orlando, FL Florida ... 0.0 0.0 12.0 16.0 33 707 LAS MCO N206FR Frontier Airlines Inc.
3 2022 2 2022-02-01 2 1 F9 Denver, CO Colorado Fort Myers, FL Florida ... NaN NaN 10.0 16.0 55 600 DEN RSW N207FR Frontier Airlines Inc.
4 2022 2 2022-02-01 2 1 F9 Fort Myers, FL Florida Buffalo, NY New York ... NaN NaN 7.0 24.0 742 1018 RSW BUF N207FR Frontier Airlines Inc.
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5975536 2022 1 2022-01-03 1 3 YV Cedar Rapids/Iowa City, IA Iowa Dallas/Fort Worth, TX Texas ... 0.0 60.0 31.0 13.0 1755 1933 CID DFW N957LR Mesa Airlines Inc.
5975537 2022 1 2022-01-01 6 1 YV Dallas/Fort Worth, TX Texas Mobile, AL Alabama ... NaN NaN 3.0 24.0 1802 1909 DFW MOB N958LR Mesa Airlines Inc.
5975538 2022 1 2022-01-02 7 2 YV Dallas/Fort Worth, TX Texas Mobile, AL Alabama ... 0.0 0.0 6.0 18.0 1903 2014 DFW MOB N919FJ Mesa Airlines Inc.
5975539 2022 1 2022-01-01 6 1 YV Mobile, AL Alabama Dallas/Fort Worth, TX Texas ... NaN NaN 22.0 9.0 1958 2134 MOB DFW N958LR Mesa Airlines Inc.
5975540 2022 1 2022-01-02 7 2 YV Mobile, AL Alabama Dallas/Fort Worth, TX Texas ... 0.0 60.0 12.0 10.0 2133 2302 MOB DFW N919FJ Mesa Airlines Inc.

5975541 rows × 34 columns

In [ ]:
#Identifying unique values for the data
for col in small_airlines.select_dtypes(include='object').columns:
    print(col)
    print(small_airlines[col].unique())
flightdate
['2022-02-01' '2022-02-02' '2022-02-03' ... '2022-01-29' '2022-01-30'
 '2022-01-31']
Operating_Airline
['F9' 'G7' 'AS' 'ZW' 'C5' 'HA' 'PT' 'NK' 'QX' 'YV']
OriginCityName
['Las Vegas, NV' 'San Diego, CA' 'Denver, CO' 'Fort Myers, FL'
 'Buffalo, NY' 'Orlando, FL' 'Miami, FL' 'Philadelphia, PA' 'Tampa, FL'
 'Cleveland, OH' 'Baltimore, MD' 'Phoenix, AZ' 'Trenton, NJ'
 'West Palm Beach/Palm Beach, FL' 'Chicago, IL' 'Atlanta, GA' 'Islip, NY'
 'Grand Rapids, MI' 'Houston, TX' 'Little Rock, AR' 'Boston, MA'
 'Washington, DC' 'Reno, NV' 'Providence, RI' 'Dallas/Fort Worth, TX'
 'Minneapolis, MN' 'Madison, WI' 'Ontario, CA' 'Salt Lake City, UT'
 'Sacramento, CA' 'Oakland, CA' 'Raleigh/Durham, NC' 'San Juan, PR'
 'Santa Ana, CA' 'San Francisco, CA' 'New Orleans, LA' 'Portland, ME'
 'Portland, OR' 'Cincinnati, OH' 'Newark, NJ' 'Hartford, CT'
 'Charlotte, NC' 'Omaha, NE' 'Detroit, MI' 'Fayetteville, AR'
 'Seattle, WA' 'Kansas City, MO' 'St. Louis, MO' 'Pensacola, FL'
 'Syracuse, NY' 'Indianapolis, IN' 'Rochester, NY' 'Jacksonville, FL'
 'New York, NY' 'Burbank, CA' 'Milwaukee, WI' 'Austin, TX'
 'Cedar Rapids/Iowa City, IA' 'Colorado Springs, CO' 'Norfolk, VA'
 'Harrisburg, PA' 'Nashville, TN' 'Tulsa, OK' 'Knoxville, TN'
 'Newburgh/Poughkeepsie, NY' 'Fresno, CA' 'Albuquerque, NM'
 'Sioux Falls, SD' 'El Paso, TX' 'Green Bay, WI' 'Fargo, ND'
 'Bloomington/Normal, IL' 'Des Moines, IA' 'Harlingen/San Benito, TX'
 'Tucson, AZ' 'San Antonio, TX' 'Huntsville, AL' 'Wilmington, DE'
 'Oklahoma City, OK' 'Durango, CO' 'Sarasota/Bradenton, FL' 'Spokane, WA'
 'Columbus, OH' 'Memphis, TN' 'Louisville, KY' 'Burlington, VT'
 'Albany, NY' 'Pittsburgh, PA' 'Fort Lauderdale, FL' 'Charleston, SC'
 'Charlotte Amalie, VI' 'Bismarck/Mandan, ND' 'Myrtle Beach, SC'
 'Aguadilla, PR' 'Savannah, GA' 'Grand Junction, CO' 'Branson, MO'
 'Kalispell, MT' 'Missoula, MT' 'Bozeman, MT' 'Los Angeles, CA'
 'Palm Springs, CA' 'San Jose, CA' 'Billings, MT' 'Wichita, KS'
 'Jackson, WY' 'Jackson/Vicksburg, MS' 'Boise, ID' 'Anchorage, AK'
 'Greer, SC' 'Mobile, AL' 'Tyler, TX' 'Santa Barbara, CA' 'Portsmouth, NH'
 'Lafayette, LA' 'Birmingham, AL' 'Greensboro/High Point, NC'
 'Springfield, MO' 'Presque Isle/Houlton, ME' 'Richmond, VA'
 'Traverse City, MI' 'Manchester, NH' 'Wilmington, NC' 'Akron, OH'
 'Hilton Head, SC' 'Flint, MI' 'Scranton/Wilkes-Barre, PA' 'Asheville, NC'
 'State College, PA' 'Ithaca/Cortland, NY' 'Lincoln, NE' 'Nantucket, MA'
 'Allentown/Bethlehem/Easton, PA' 'Dayton, OH' 'Roanoke, VA' 'Bangor, ME'
 'Peoria, IL' 'Saginaw/Bay City/Midland, MI' 'Moline, IL' 'Lexington, KY'
 'Kalamazoo, MI' 'Lansing, MI' 'Appleton, WI' 'Hayden, CO'
 'Montrose/Delta, CO' 'Pasco/Kennewick/Richland, WA' 'Bend/Redmond, OR'
 'Casper, WY' 'Midland/Odessa, TX' 'Eagle, CO' 'Gunnison, CO'
 'Elmira/Corning, NY' 'Rapid City, SD' 'Medford, OR' 'Brownsville, TX'
 'South Bend, IN' 'Cody, WY' 'Great Falls, MT' 'Kodiak, AK' 'Bethel, AK'
 'Barrow, AK' 'Deadhorse, AK' 'Juneau, AK' 'Ketchikan, AK' 'Yakutat, AK'
 'Cordova, AK' 'Sitka, AK' 'Petersburg, AK' 'Wrangell, AK' 'Fairbanks, AK'
 'Dallas, TX' 'Nome, AK' 'Kotzebue, AK' 'Honolulu, HI' 'Kahului, HI'
 'Lihue, HI' 'Kona, HI' 'Adak Island, AK' 'Everett, WA' 'King Salmon, AK'
 'Gustavus, AK' 'Dillingham, AK' 'Santa Rosa, CA' 'Cold Bay, AK'
 'Eugene, OR' 'Bellingham, WA' 'White Plains, NY' 'Charleston/Dunbar, WV'
 'Springfield, IL' 'Duluth, MN' 'Columbia, SC' 'Erie, PA'
 'Charlottesville, VA' 'Chattanooga, TN' 'Panama City, FL'
 'Fort Wayne, IN' 'Columbia, MO' 'Mosinee, WI' 'Evansville, IN'
 'Valparaiso, FL' 'Waterloo, IA' 'La Crosse, WI' 'Rochester, MN'
 'Fayetteville, NC' 'Newport News/Williamsburg, VA' 'Champaign/Urbana, IL'
 'Baton Rouge, LA' 'Shreveport, LA' 'Flagstaff, AZ' 'Dickinson, ND'
 'Texarkana, AR' 'Gulfport/Biloxi, MS' 'Laredo, TX' 'Corpus Christi, TX'
 'Mission/McAllen/Edinburg, TX' 'Lubbock, TX' 'Santa Fe, NM'
 'Lake Charles, LA' 'Amarillo, TX' 'Alexandria, LA' 'Idaho Falls, ID'
 'Helena, MT' 'Hobbs, NM' 'College Station/Bryan, TX' 'Killeen, TX'
 'Monroe, LA' 'Minot, ND' 'Long Beach, CA' 'Hilo, HI' 'Pago Pago, TT'
 'Montgomery, AL' 'Greenville, NC' 'Bristol/Johnson City/Kingsport, TN'
 'Lynchburg, VA' 'Florence, SC' 'Ashland, WV' 'Augusta, GA'
 'Watertown, NY' 'Tallahassee, FL' 'Salisbury, MD' 'Gainesville, FL'
 'Jacksonville/Camp Lejeune, NC' 'Toledo, OH' 'Columbus, GA'
 'New Bern/Morehead/Beaufort, NC' 'Williamsport, PA' 'Worcester, MA'
 'Melbourne, FL' 'Atlantic City, NJ' 'Latrobe, PA' 'Christiansted, VI'
 'Niagara Falls, NY' 'Ponce, PR' 'Plattsburgh, NY' 'Yakima, WA'
 'Pullman, WA' 'Walla Walla, WA' 'Wenatchee, WA' 'San Luis Obispo, CA'
 'Sun Valley/Hailey/Ketchum, ID' 'Redding, CA' 'Monterey, CA'
 'Lewiston, ID' 'Mammoth Lakes, CA' 'Key West, FL' 'Yuma, AZ'
 'Bakersfield, CA' 'Daytona Beach, FL' 'Fort Smith, AR'
 'Lawton/Fort Sill, OK' 'Roswell, NM' 'Joplin, MO' 'Grand Island, NE']
OriginStateName
['Nevada' 'California' 'Colorado' 'Florida' 'New York' 'Pennsylvania'
 'Ohio' 'Maryland' 'Arizona' 'New Jersey' 'Illinois' 'Georgia' 'Michigan'
 'Texas' 'Arkansas' 'Massachusetts' 'Virginia' 'Rhode Island' 'Minnesota'
 'Wisconsin' 'Utah' 'North Carolina' 'Puerto Rico' 'Louisiana' 'Maine'
 'Oregon' 'Kentucky' 'Connecticut' 'Nebraska' 'Washington' 'Missouri'
 'Indiana' 'Iowa' 'Tennessee' 'Oklahoma' 'New Mexico' 'South Dakota'
 'North Dakota' 'Alabama' 'Delaware' 'Vermont' 'South Carolina'
 'U.S. Virgin Islands' 'Montana' 'Kansas' 'Wyoming' 'Mississippi' 'Idaho'
 'Alaska' 'New Hampshire' 'Hawaii' 'West Virginia'
 'U.S. Pacific Trust Territories and Possessions']
DestCityName
['San Diego, CA' 'Las Vegas, NV' 'Orlando, FL' 'Fort Myers, FL'
 'Buffalo, NY' 'Miami, FL' 'Philadelphia, PA' 'Tampa, FL' 'Cleveland, OH'
 'Baltimore, MD' 'Chicago, IL' 'Denver, CO'
 'West Palm Beach/Palm Beach, FL' 'Trenton, NJ' 'Atlanta, GA' 'Islip, NY'
 'Grand Rapids, MI' 'Houston, TX' 'Little Rock, AR' 'Newark, NJ'
 'Washington, DC' 'Phoenix, AZ' 'Reno, NV' 'Providence, RI'
 'Dallas/Fort Worth, TX' 'Minneapolis, MN' 'Madison, WI' 'Ontario, CA'
 'Salt Lake City, UT' 'Sacramento, CA' 'Oakland, CA' 'Raleigh/Durham, NC'
 'San Juan, PR' 'Santa Ana, CA' 'San Francisco, CA' 'New York, NY'
 'New Orleans, LA' 'Portland, ME' 'Cincinnati, OH' 'Hartford, CT'
 'Detroit, MI' 'Boston, MA' 'Charlotte, NC' 'Omaha, NE' 'Fayetteville, AR'
 'St. Louis, MO' 'Kansas City, MO' 'Pensacola, FL' 'Syracuse, NY'
 'Indianapolis, IN' 'Rochester, NY' 'Jacksonville, FL' 'Burbank, CA'
 'Milwaukee, WI' 'Austin, TX' 'Cedar Rapids/Iowa City, IA'
 'Colorado Springs, CO' 'Norfolk, VA' 'Harrisburg, PA' 'Nashville, TN'
 'Tulsa, OK' 'Seattle, WA' 'Knoxville, TN' 'Newburgh/Poughkeepsie, NY'
 'Fresno, CA' 'Portland, OR' 'Albuquerque, NM' 'Sioux Falls, SD'
 'El Paso, TX' 'Green Bay, WI' 'Fargo, ND' 'Bloomington/Normal, IL'
 'Des Moines, IA' 'Harlingen/San Benito, TX' 'Tucson, AZ'
 'San Antonio, TX' 'Huntsville, AL' 'Wilmington, DE' 'Oklahoma City, OK'
 'Durango, CO' 'Sarasota/Bradenton, FL' 'Spokane, WA' 'Memphis, TN'
 'Columbus, OH' 'Charlotte Amalie, VI' 'Louisville, KY' 'Burlington, VT'
 'Albany, NY' 'Pittsburgh, PA' 'Fort Lauderdale, FL' 'Charleston, SC'
 'Bismarck/Mandan, ND' 'Myrtle Beach, SC' 'Aguadilla, PR' 'Savannah, GA'
 'Grand Junction, CO' 'Branson, MO' 'Kalispell, MT' 'Missoula, MT'
 'Los Angeles, CA' 'Bozeman, MT' 'Palm Springs, CA' 'San Jose, CA'
 'Billings, MT' 'Wichita, KS' 'Jackson, WY' 'Jackson/Vicksburg, MS'
 'Boise, ID' 'Anchorage, AK' 'Greer, SC' 'Mobile, AL' 'Tyler, TX'
 'Santa Barbara, CA' 'Portsmouth, NH' 'Lafayette, LA' 'Birmingham, AL'
 'Greensboro/High Point, NC' 'Wilmington, NC' 'Richmond, VA'
 'Springfield, MO' 'Traverse City, MI' 'Manchester, NH' 'Akron, OH'
 'Presque Isle/Houlton, ME' 'Hilton Head, SC' 'Flint, MI'
 'Scranton/Wilkes-Barre, PA' 'State College, PA' 'Ithaca/Cortland, NY'
 'Asheville, NC' 'Lincoln, NE' 'Nantucket, MA' 'Dayton, OH'
 'Allentown/Bethlehem/Easton, PA' 'Roanoke, VA' 'Bangor, ME' 'Peoria, IL'
 'Saginaw/Bay City/Midland, MI' 'Moline, IL' 'Lexington, KY'
 'Kalamazoo, MI' 'Lansing, MI' 'Appleton, WI' 'Hayden, CO'
 'Montrose/Delta, CO' 'Pasco/Kennewick/Richland, WA' 'Bend/Redmond, OR'
 'Casper, WY' 'Midland/Odessa, TX' 'Eagle, CO' 'Gunnison, CO'
 'Elmira/Corning, NY' 'Rapid City, SD' 'Medford, OR' 'Brownsville, TX'
 'South Bend, IN' 'Cody, WY' 'Great Falls, MT' 'Bethel, AK' 'Barrow, AK'
 'Deadhorse, AK' 'Ketchikan, AK' 'Juneau, AK' 'Cordova, AK' 'Yakutat, AK'
 'Sitka, AK' 'Wrangell, AK' 'Petersburg, AK' 'Honolulu, HI' 'Kodiak, AK'
 'Fairbanks, AK' 'Nome, AK' 'Kotzebue, AK' 'Dallas, TX' 'Kahului, HI'
 'Kona, HI' 'Lihue, HI' 'Adak Island, AK' 'Everett, WA' 'King Salmon, AK'
 'Gustavus, AK' 'Dillingham, AK' 'Santa Rosa, CA' 'Cold Bay, AK'
 'Eugene, OR' 'Bellingham, WA' 'Columbia, SC' 'Erie, PA'
 'Charlottesville, VA' 'Springfield, IL' 'Charleston/Dunbar, WV'
 'White Plains, NY' 'Duluth, MN' 'Panama City, FL' 'Chattanooga, TN'
 'Fort Wayne, IN' 'Evansville, IN' 'Columbia, MO' 'Mosinee, WI'
 'Valparaiso, FL' 'Waterloo, IA' 'La Crosse, WI' 'Rochester, MN'
 'Fayetteville, NC' 'Newport News/Williamsburg, VA' 'Champaign/Urbana, IL'
 'Gulfport/Biloxi, MS' 'Lubbock, TX' 'Shreveport, LA' 'Amarillo, TX'
 'Corpus Christi, TX' 'Baton Rouge, LA' 'Laredo, TX' 'Texarkana, AR'
 'Alexandria, LA' 'Dickinson, ND' 'Mission/McAllen/Edinburg, TX'
 'Flagstaff, AZ' 'Lake Charles, LA' 'Santa Fe, NM' 'Idaho Falls, ID'
 'Helena, MT' 'Hobbs, NM' 'Killeen, TX' 'College Station/Bryan, TX'
 'Monroe, LA' 'Minot, ND' 'Long Beach, CA' 'Hilo, HI' 'Pago Pago, TT'
 'Lynchburg, VA' 'Bristol/Johnson City/Kingsport, TN' 'Watertown, NY'
 'Salisbury, MD' 'Florence, SC' 'Greenville, NC' 'Augusta, GA'
 'Montgomery, AL' 'Tallahassee, FL' 'Ashland, WV' 'Gainesville, FL'
 'Jacksonville/Camp Lejeune, NC' 'Toledo, OH' 'Columbus, GA'
 'New Bern/Morehead/Beaufort, NC' 'Williamsport, PA' 'Worcester, MA'
 'Melbourne, FL' 'Atlantic City, NJ' 'Latrobe, PA' 'Christiansted, VI'
 'Niagara Falls, NY' 'Ponce, PR' 'Plattsburgh, NY' 'Yakima, WA'
 'Walla Walla, WA' 'Pullman, WA' 'Wenatchee, WA' 'San Luis Obispo, CA'
 'Sun Valley/Hailey/Ketchum, ID' 'Redding, CA' 'Monterey, CA'
 'Lewiston, ID' 'Mammoth Lakes, CA' 'Key West, FL' 'Yuma, AZ'
 'Bakersfield, CA' 'Daytona Beach, FL' 'Fort Smith, AR'
 'Lawton/Fort Sill, OK' 'Roswell, NM' 'Joplin, MO' 'Grand Island, NE']
DestStateName
['California' 'Nevada' 'Florida' 'New York' 'Pennsylvania' 'Ohio'
 'Maryland' 'Illinois' 'Colorado' 'New Jersey' 'Georgia' 'Michigan'
 'Texas' 'Arkansas' 'Virginia' 'Arizona' 'Rhode Island' 'Minnesota'
 'Wisconsin' 'Utah' 'North Carolina' 'Puerto Rico' 'Louisiana' 'Maine'
 'Kentucky' 'Connecticut' 'Massachusetts' 'Nebraska' 'Missouri' 'Indiana'
 'Iowa' 'Tennessee' 'Oklahoma' 'Washington' 'Oregon' 'New Mexico'
 'South Dakota' 'North Dakota' 'Alabama' 'Delaware' 'U.S. Virgin Islands'
 'Vermont' 'South Carolina' 'Montana' 'Kansas' 'Wyoming' 'Mississippi'
 'Idaho' 'Alaska' 'New Hampshire' 'Hawaii' 'West Virginia'
 'U.S. Pacific Trust Territories and Possessions']
Origin
['LAS' 'SAN' 'DEN' 'RSW' 'BUF' 'MCO' 'MIA' 'PHL' 'TPA' 'CLE' 'BWI' 'PHX'
 'TTN' 'PBI' 'ORD' 'ATL' 'ISP' 'GRR' 'IAH' 'LIT' 'BOS' 'DCA' 'RNO' 'PVD'
 'DFW' 'MSP' 'MSN' 'ONT' 'SLC' 'SMF' 'OAK' 'RDU' 'SJU' 'SNA' 'SFO' 'MSY'
 'PWM' 'PDX' 'CVG' 'EWR' 'BDL' 'CLT' 'OMA' 'DTW' 'XNA' 'SEA' 'MCI' 'STL'
 'PNS' 'SYR' 'IND' 'ROC' 'JAX' 'LGA' 'BUR' 'MKE' 'AUS' 'CID' 'COS' 'ORF'
 'MDT' 'BNA' 'TUL' 'TYS' 'SWF' 'FAT' 'ABQ' 'FSD' 'ELP' 'GRB' 'FAR' 'BMI'
 'DSM' 'HRL' 'TUS' 'SAT' 'HSV' 'ILG' 'OKC' 'DRO' 'SRQ' 'GEG' 'CMH' 'MEM'
 'SDF' 'BTV' 'ALB' 'PIT' 'FLL' 'CHS' 'STT' 'BIS' 'MYR' 'BQN' 'SAV' 'GJT'
 'MDW' 'BKG' 'HOU' 'FCA' 'MSO' 'BZN' 'IAD' 'LAX' 'PSP' 'SJC' 'BIL' 'ICT'
 'JAC' 'JAN' 'BOI' 'ANC' 'GSP' 'BFM' 'TYR' 'SBA' 'PSM' 'LFT' 'BHM' 'GSO'
 'SGF' 'PQI' 'RIC' 'TVC' 'MHT' 'ILM' 'CAK' 'HHH' 'FNT' 'AVP' 'AVL' 'SCE'
 'ITH' 'LNK' 'ACK' 'ABE' 'DAY' 'ROA' 'BGR' 'PIA' 'MBS' 'MLI' 'LEX' 'AZO'
 'LAN' 'ATW' 'HDN' 'MTJ' 'PSC' 'RDM' 'CPR' 'MAF' 'EGE' 'GUC' 'ELM' 'RAP'
 'MFR' 'BRO' 'SBN' 'COD' 'GTF' 'JFK' 'ADQ' 'BET' 'BRW' 'SCC' 'JNU' 'KTN'
 'YAK' 'CDV' 'SIT' 'PSG' 'WRG' 'FAI' 'DAL' 'OME' 'OTZ' 'HNL' 'OGG' 'LIH'
 'KOA' 'ADK' 'PAE' 'AKN' 'GST' 'DLG' 'STS' 'CDB' 'EUG' 'BLI' 'HPN' 'CRW'
 'SPI' 'DLH' 'CAE' 'ERI' 'CHO' 'CHA' 'ECP' 'FWA' 'COU' 'CWA' 'EVV' 'VPS'
 'ALO' 'LSE' 'RST' 'FAY' 'PHF' 'CMI' 'BTR' 'SHV' 'FLG' 'DIK' 'TXK' 'GPT'
 'LRD' 'CRP' 'MFE' 'LBB' 'MOB' 'SAF' 'LCH' 'AMA' 'AEX' 'IDA' 'HLN' 'HOB'
 'CLL' 'GRK' 'MLU' 'MOT' 'LGB' 'ITO' 'PPG' 'MGM' 'PGV' 'TRI' 'LYH' 'FLO'
 'HTS' 'AGS' 'ART' 'TLH' 'SBY' 'GNV' 'OAJ' 'TOL' 'CSG' 'EWN' 'IPT' 'ORH'
 'MLB' 'ACY' 'LBE' 'STX' 'IAG' 'PSE' 'PBG' 'YKM' 'PUW' 'ALW' 'EAT' 'SBP'
 'SUN' 'RDD' 'MRY' 'LWS' 'MMH' 'EYW' 'YUM' 'BFL' 'DAB' 'FSM' 'LAW' 'ROW'
 'JLN' 'GRI']
Dest
['SAN' 'LAS' 'MCO' 'RSW' 'BUF' 'MIA' 'PHL' 'TPA' 'CLE' 'BWI' 'ORD' 'DEN'
 'PBI' 'TTN' 'ATL' 'ISP' 'GRR' 'IAH' 'LIT' 'EWR' 'DCA' 'PHX' 'RNO' 'PVD'
 'DFW' 'MSP' 'MSN' 'ONT' 'SLC' 'SMF' 'OAK' 'RDU' 'SJU' 'SNA' 'SFO' 'LGA'
 'MSY' 'PWM' 'CVG' 'BDL' 'DTW' 'BOS' 'CLT' 'OMA' 'XNA' 'STL' 'MCI' 'PNS'
 'SYR' 'IND' 'ROC' 'JAX' 'BUR' 'MKE' 'AUS' 'CID' 'COS' 'ORF' 'MDT' 'BNA'
 'TUL' 'SEA' 'TYS' 'SWF' 'FAT' 'PDX' 'ABQ' 'FSD' 'ELP' 'GRB' 'FAR' 'BMI'
 'DSM' 'HRL' 'TUS' 'SAT' 'HSV' 'ILG' 'OKC' 'DRO' 'SRQ' 'GEG' 'MEM' 'CMH'
 'STT' 'SDF' 'BTV' 'ALB' 'PIT' 'FLL' 'CHS' 'BIS' 'MYR' 'BQN' 'SAV' 'GJT'
 'MDW' 'BKG' 'HOU' 'FCA' 'MSO' 'LAX' 'BZN' 'IAD' 'PSP' 'SJC' 'BIL' 'ICT'
 'JAC' 'JAN' 'BOI' 'ANC' 'GSP' 'BFM' 'TYR' 'SBA' 'PSM' 'LFT' 'BHM' 'GSO'
 'ILM' 'RIC' 'SGF' 'TVC' 'MHT' 'CAK' 'PQI' 'HHH' 'FNT' 'AVP' 'SCE' 'ITH'
 'AVL' 'LNK' 'ACK' 'DAY' 'ABE' 'ROA' 'BGR' 'PIA' 'MBS' 'MLI' 'LEX' 'AZO'
 'LAN' 'ATW' 'HDN' 'MTJ' 'PSC' 'RDM' 'CPR' 'MAF' 'EGE' 'GUC' 'ELM' 'RAP'
 'MFR' 'BRO' 'SBN' 'COD' 'GTF' 'JFK' 'BET' 'BRW' 'SCC' 'KTN' 'JNU' 'CDV'
 'YAK' 'SIT' 'WRG' 'PSG' 'HNL' 'ADQ' 'FAI' 'OME' 'OTZ' 'DAL' 'OGG' 'KOA'
 'LIH' 'ADK' 'PAE' 'AKN' 'GST' 'DLG' 'STS' 'CDB' 'EUG' 'BLI' 'CAE' 'ERI'
 'CHO' 'SPI' 'CRW' 'HPN' 'DLH' 'ECP' 'CHA' 'FWA' 'EVV' 'COU' 'CWA' 'VPS'
 'ALO' 'LSE' 'RST' 'FAY' 'PHF' 'CMI' 'GPT' 'LBB' 'SHV' 'AMA' 'CRP' 'BTR'
 'LRD' 'TXK' 'MOB' 'AEX' 'DIK' 'MFE' 'FLG' 'LCH' 'SAF' 'IDA' 'HLN' 'HOB'
 'GRK' 'CLL' 'MLU' 'MOT' 'LGB' 'ITO' 'PPG' 'LYH' 'TRI' 'ART' 'SBY' 'FLO'
 'PGV' 'AGS' 'MGM' 'TLH' 'HTS' 'GNV' 'OAJ' 'TOL' 'CSG' 'EWN' 'IPT' 'ORH'
 'MLB' 'ACY' 'LBE' 'STX' 'IAG' 'PSE' 'PBG' 'YKM' 'ALW' 'PUW' 'EAT' 'SBP'
 'SUN' 'RDD' 'MRY' 'LWS' 'MMH' 'EYW' 'YUM' 'BFL' 'DAB' 'FSM' 'LAW' 'ROW'
 'JLN' 'GRI']
Tail_Number
['N206FR' 'N207FR' 'N211FR' ... 'N926LR' 'N243LR' 'N407SW']
Description
['Frontier Airlines Inc.' 'GoJet Airlines LLC d/b/a United Express'
 'Alaska Airlines Inc.' 'Air Wisconsin Airlines Corp'
 'CommuteAir LLC dba CommuteAir' 'Hawaiian Airlines Inc.'
 'Piedmont Airlines' 'Spirit Air Lines' 'Horizon Air' 'Mesa Airlines Inc.']
In [ ]:
#Finding missing values in the data
missing_values=[features for features in small_airlines.columns if small_airlines[features].isnull().sum()>0]
for feature in missing_values:
    {print(feature,np.round(small_airlines[feature].isnull().mean(),4), '% of missing values')}
CarrierDelay 0.8086 % of missing values
WeatherDelay 0.8086 % of missing values
NASDelay 0.8086 % of missing values
SecurityDelay 0.8086 % of missing values
LateAircraftDelay 0.8086 % of missing values
In [ ]:
#We can see that the causes of delay fields have many missing values which are null values and we are imputing these columns with 0 for our analysis
In [ ]:
small_airlines['CarrierDelay'].fillna(0, inplace=True)
small_airlines['WeatherDelay'].fillna(0, inplace=True)
small_airlines['NASDelay'].fillna(0, inplace=True)
small_airlines['SecurityDelay'].fillna(0, inplace=True)
small_airlines['LateAircraftDelay'].fillna(0, inplace=True)
In [ ]:
#To Checking distinct values for feature and decide if that column needs to be included in the analysis
for column in small_airlines.columns:
    print(column,small_airlines[column].nunique())
year 6
month 12
flightdate 1915
DayOfWeek 7
DayofMonth 31
Operating_Airline 10
OriginCityName 273
OriginStateName 53
DestCityName 273
DestStateName 53
DepTime 1440
DepDelay 1540
DepDelayMinutes 1457
DepDel15 2
ArrTime 1440
ArrDelay 1580
ArrDelayMinutes 1468
ArrDel15 2
CRSDepTime 1359
CRSArrTime 1432
Distance 694
CarrierDelay 1315
WeatherDelay 984
NASDelay 905
SecurityDelay 208
LateAircraftDelay 1173
TaxiIn 204
TaxiOut 189
WheelsOff 1440
WheelsOn 1440
Origin 278
Dest 278
Tail_Number 1313
Description 10
In [ ]:
#Identifying the number of categorical and numerical variables in dataset
numerical_variables =  list(small_airlines.select_dtypes(include=['int64','float64']).drop('ArrDel15', axis=1).columns)
categorical_variables =  list(small_airlines.select_dtypes(include=['object']).columns)
In [ ]:
numerical_variables#24 numerical features
Out[ ]:
['year',
 'month',
 'DayOfWeek',
 'DayofMonth',
 'DepTime',
 'DepDelay',
 'DepDelayMinutes',
 'DepDel15',
 'ArrTime',
 'ArrDelay',
 'ArrDelayMinutes',
 'CRSDepTime',
 'CRSArrTime',
 'Distance',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay',
 'TaxiIn',
 'TaxiOut',
 'WheelsOff',
 'WheelsOn']
In [ ]:
categorical_variables#10 categorical features
Out[ ]:
['flightdate',
 'Operating_Airline',
 'OriginCityName',
 'OriginStateName',
 'DestCityName',
 'DestStateName',
 'Origin',
 'Dest',
 'Tail_Number',
 'Description']
In [ ]:
#Finding the variable distribution for the data
plt.figure(figsize=(12,80), facecolor='white')
plotnumber =1
for cat in categorical_variables:
    ax = plt.subplot(14,1,plotnumber)
    sns.countplot(y=cat,data=small_airlines)
    #plt.xlabel(categorical_variables)
    plotnumber+=1
plt.show()
In [ ]:
#Check target label split over categorical features and find the count for the dataset
for cat in categorical_variables:
    print(small_airlines.groupby(['ArrDel15',cat]).size())
ArrDel15  flightdate
0.0       2018-01-01    2268
          2018-01-02    2418
          2018-01-03    2350
          2018-01-04    2131
          2018-01-05    2304
                        ... 
1.0       2023-04-26     713
          2023-04-27     950
          2023-04-28     911
          2023-04-29     783
          2023-04-30    1030
Length: 3830, dtype: int64
ArrDel15  Operating_Airline
0.0       AS                   910950
          C5                   222734
          F9                   492849
          G7                   255105
          HA                   300887
          NK                   785013
          PT                   400531
          QX                   455119
          YV                   677316
          ZW                   331279
1.0       AS                   192204
          C5                    74673
          F9                   169911
          G7                    63412
          HA                    50364
          NK                   200458
          PT                    78146
          QX                    75818
          YV                   163770
          ZW                    75002
dtype: int64
ArrDel15  OriginCityName 
0.0       Adak Island, AK      325
          Aguadilla, PR       2141
          Akron, OH           6417
          Albany, NY         15844
          Albuquerque, NM    12700
                             ...  
1.0       Worcester, MA        151
          Wrangell, AK         490
          Yakima, WA           720
          Yakutat, AK          513
          Yuma, AZ             226
Length: 546, dtype: int64
ArrDel15  OriginStateName
0.0       Alabama             30962
          Alaska             145991
          Arizona            103746
          Arkansas            13741
          California         413171
                              ...  
1.0       Virginia            58157
          Washington          98349
          West Virginia        1306
          Wisconsin           10414
          Wyoming               784
Length: 106, dtype: int64
ArrDel15  DestCityName   
0.0       Adak Island, AK      398
          Aguadilla, PR       2060
          Akron, OH           6453
          Albany, NY         15048
          Albuquerque, NM    12384
                             ...  
1.0       Worcester, MA        103
          Wrangell, AK         649
          Yakima, WA           644
          Yakutat, AK          553
          Yuma, AZ             254
Length: 546, dtype: int64
ArrDel15  DestStateName
0.0       Alabama           30386
          Alaska           142753
          Arizona          104473
          Arkansas          13525
          California       407030
                            ...  
1.0       Virginia          54907
          Washington        97485
          West Virginia      1235
          Wisconsin         10972
          Wyoming             726
Length: 106, dtype: int64
ArrDel15  Origin
0.0       ABE        3980
          ABQ       12700
          ACK         930
          ACY       12732
          ADK         325
                    ...  
1.0       WRG         490
          XNA        1381
          YAK         513
          YKM         720
          YUM         226
Length: 556, dtype: int64
ArrDel15  Dest
0.0       ABE      3913
          ABQ     12384
          ACK      1032
          ACY     12273
          ADK       398
                  ...  
1.0       WRG       649
          XNA      1530
          YAK       553
          YKM       644
          YUM       254
Length: 556, dtype: int64
ArrDel15  Tail_Number
0.0       N10156         1104
          N101NK           20
          N11106          814
          N11107          897
          N11113          933
                         ... 
1.0       N979NK           28
          N980NK           45
          N999FR            6
          NZ52AW           18
          NZ55AW           18
Length: 2625, dtype: int64
ArrDel15  Description                            
0.0       Air Wisconsin Airlines Corp                331279
          Alaska Airlines Inc.                       910950
          CommuteAir LLC dba CommuteAir              222734
          Frontier Airlines Inc.                     492849
          GoJet Airlines LLC d/b/a United Express    255105
          Hawaiian Airlines Inc.                     300887
          Horizon Air                                455119
          Mesa Airlines Inc.                         677316
          Piedmont Airlines                          400531
          Spirit Air Lines                           785013
1.0       Air Wisconsin Airlines Corp                 75002
          Alaska Airlines Inc.                       192204
          CommuteAir LLC dba CommuteAir               74673
          Frontier Airlines Inc.                     169911
          GoJet Airlines LLC d/b/a United Express     63412
          Hawaiian Airlines Inc.                      50364
          Horizon Air                                 75818
          Mesa Airlines Inc.                         163770
          Piedmont Airlines                           78146
          Spirit Air Lines                           200458
dtype: int64
In [ ]:
#Exploring and analyzing the numerical variables
# list of numerical variables
numerical_features = [feature for feature in small_airlines.columns if ((small_airlines[feature].dtypes != 'O') & (feature not in ['outcome']))]
print('Number of numerical variables: ', len(numerical_features))

# visualise the numerical variables
small_airlines[numerical_features].head()
Number of numerical variables:  24
Out[ ]:
year month DayOfWeek DayofMonth DepTime DepDelay DepDelayMinutes DepDel15 ArrTime ArrDelay ... Distance CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay TaxiIn TaxiOut WheelsOff WheelsOn
0 2022 2 2 1 1752 -6.0 0.0 0.0 1902 -11.0 ... 50.0 0.0 0.0 0.0 0.0 0.0 5.0 15.0 1807 1857
1 2022 2 2 1 1951 -7.0 0.0 0.0 2108 -13.0 ... 48.0 0.0 0.0 0.0 0.0 0.0 11.0 18.0 2009 2057
2 2022 2 2 1 17 90.0 90.0 1.0 719 71.0 ... 214.0 71.0 0.0 0.0 0.0 0.0 12.0 16.0 33 707
3 2022 2 2 1 39 19.0 19.0 1.0 610 11.0 ... 185.0 0.0 0.0 0.0 0.0 0.0 10.0 16.0 55 600
4 2022 2 2 1 718 -12.0 0.0 0.0 1025 0.0 ... 156.0 0.0 0.0 0.0 0.0 0.0 7.0 24.0 742 1018

5 rows × 24 columns

In [ ]:
#Finding discrete numerical variables
discrete_feature=[feature for feature in numerical_features if len(small_airlines[feature].unique())<25]
print("Discrete Variables Count: {}".format(len(discrete_feature)))
Discrete Variables Count: 5
In [ ]:
#Identifying continuous numerical variables
continuous_features=[feature for feature in numerical_features if feature not in discrete_feature+['ArrDel15']]
print("Continuous feature Count {}".format(len(continuous_features)))
Continuous feature Count 19
In [ ]:
#Distribution of the continuous numerical variables
plt.figure(figsize=(20,60), facecolor='white')
plotnumber =1
for continuous_feature in continuous_features:
    ax = plt.subplot(12,3,plotnumber)
    sns.distplot(small_airlines[continuous_feature])
    plt.xlabel(continuous_feature)
    plotnumber+=1
plt.show()
In [ ]:
#boxplot to show target distribution with respect to numerical features
plt.figure(figsize=(20,60), facecolor='white')
plotnumber =1
for feature in continuous_features:
    ax = plt.subplot(12,3,plotnumber)
    sns.boxplot(x="ArrDel15", y= small_airlines[feature], data=small_airlines)
    plt.xlabel(feature)
    plotnumber+=1
plt.show()
In [36]:
#Identifying outliers on numerical features
plt.figure(figsize=(20,60), facecolor='white')
plotnumber =1
for numerical_feature in numerical_features:
    ax = plt.subplot(12,3,plotnumber)
    sns.boxplot(small_airlines[numerical_feature])
    plt.xlabel(numerical_feature)
    plotnumber+=1
plt.show()
In [37]:
df_numericals = small_airlines[numerical_variables]
df_numericals.head()
Out[37]:
year month DayOfWeek DayofMonth DepTime DepDelay DepDelayMinutes DepDel15 ArrTime ArrDelay ... Distance CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay TaxiIn TaxiOut WheelsOff WheelsOn
0 2022 2 2 1 1752 -6.0 0.0 0.0 1902 -11.0 ... 50.0 0.0 0.0 0.0 0.0 0.0 5.0 15.0 1807 1857
1 2022 2 2 1 1951 -7.0 0.0 0.0 2108 -13.0 ... 48.0 0.0 0.0 0.0 0.0 0.0 11.0 18.0 2009 2057
2 2022 2 2 1 17 90.0 90.0 1.0 719 71.0 ... 214.0 71.0 0.0 0.0 0.0 0.0 12.0 16.0 33 707
3 2022 2 2 1 39 19.0 19.0 1.0 610 11.0 ... 185.0 0.0 0.0 0.0 0.0 0.0 10.0 16.0 55 600
4 2022 2 2 1 718 -12.0 0.0 0.0 1025 0.0 ... 156.0 0.0 0.0 0.0 0.0 0.0 7.0 24.0 742 1018

5 rows × 23 columns

In [38]:
#Finding the correlation between numerical features
## Checking for correlation
cor_mat=df_numericals.corr()
fig = plt.figure(figsize=(15,7))
sns.heatmap(cor_mat,annot=True)
Out[38]:
<Axes: >
In [39]:
#Checking if the dataset is balanced with respect to the target variable
sns.countplot(x='ArrDel15',data=small_airlines)
plt.show()
In [45]:
plt.figure(figsize=(20, 10))
sns.set(font_scale=1.6)
axis = sns.countplot(x=small_airlines['Description'], data=small_airlines, order=small_airlines['Description'].value_counts().iloc[0:18].index, orient="v")
axis.set_xticklabels(axis.get_xticklabels(), rotation=90, ha='right')
plt.title('TOTAL NUMBER OF FLIGHTS BY AIRLINES', fontsize=24)
plt.xlabel('OPERATING AIRLINE', fontsize=18)
plt.ylabel('NUMBER OF FLIGHTS', fontsize=18)
plt.tight_layout()
plt.show()

This plot tells us that the Top 5 Small Airlines that have most domestic flights are: Alaska Airlines Inc. Spirit Airlines Mesa Airlines Inc. Frontier Airlines Inc. Horizon Air

In [46]:
small_airlines.Description.value_counts()#Double checking if the counts match
Out[46]:
Alaska Airlines Inc.                       1103154
Spirit Air Lines                            985471
Mesa Airlines Inc.                          841086
Frontier Airlines Inc.                      662760
Horizon Air                                 530937
Piedmont Airlines                           478677
Air Wisconsin Airlines Corp                 406281
Hawaiian Airlines Inc.                      351251
GoJet Airlines LLC d/b/a United Express     318517
CommuteAir LLC dba CommuteAir               297407
Name: Description, dtype: int64

Now we will calculate the Total Number of flights that are delayed by Airline

In [ ]:
plt.figure(figsize=(20, 10))
small_airlines.groupby('Description').ArrDel15.sum().sort_values(ascending=False).plot.bar()
plt.title('TOTAL DELAYED FLIGHTS BY AIRLINE', fontsize=20)
plt.xlabel('AIRLINE NAME', fontsize=16)
plt.ylabel('NUMBER OF FLIGHTS', fontsize=16)
plt.rc('xtick',labelsize=10)
plt.rc('ytick',labelsize=10)
plt.show()

As per the Total Number of delays per airline we can see that the Top 5 Airlines are as below:

  • Spirit Airlines
  • Alaska Airlines
  • Frontier Airlines
  • Mesa Airlines
  • Piedmont Airlines

Here Piedmont Airlines and Horizon Air have swapped places where Horizon Air is the only airline which was in the Top 5 flights list but ended up in the 6th position in terms of delay

In [47]:
small_airlines.ArrDel15.value_counts(normalize=True)
Out[47]:
0.0    0.808593
1.0    0.191407
Name: ArrDel15, dtype: float64

Here we are trying to identify the percentage of airlines that were delayed as per our target variable (ArrDel15) and we can see that around 19.14% flights are delayed and around 80.85% of the flights have been on time.

In [48]:
# Now we calculate the percentage of flights delayed
df1_PFD = small_airlines[['Description', 'ArrDel15']]
df1_PFD.head()
Out[48]:
Description ArrDel15
0 Frontier Airlines Inc. 0.0
1 Frontier Airlines Inc. 0.0
2 Frontier Airlines Inc. 1.0
3 Frontier Airlines Inc. 0.0
4 Frontier Airlines Inc. 0.0
In [49]:
df1_PFD.shape
Out[49]:
(5975541, 2)
In [50]:
# Non-normalized value_counts
df1_PFD.ArrDel15.value_counts()
Out[50]:
0.0    4831783
1.0    1143758
Name: ArrDel15, dtype: int64
In [51]:
t1 = df1_PFD.groupby(['Description']).sum().reset_index()

t2 = small_airlines.Description.value_counts().rename_axis('Description').reset_index(name='TOTAL_FLIGHTS')

df_fp = pd.merge(t1, t2, on='Description')

df_fp['PERCENTAGE_DELAYED'] = round((df_fp['ArrDel15'] * 100) / df_fp['TOTAL_FLIGHTS'], 2)
df_fp.rename({'ArrDel15': 'DELAYED_FLIGHTS'}, axis=1, inplace=True)

df_fp.head(20)
Out[51]:
Description DELAYED_FLIGHTS TOTAL_FLIGHTS PERCENTAGE_DELAYED
0 Air Wisconsin Airlines Corp 75002.0 406281 18.46
1 Alaska Airlines Inc. 192204.0 1103154 17.42
2 CommuteAir LLC dba CommuteAir 74673.0 297407 25.11
3 Frontier Airlines Inc. 169911.0 662760 25.64
4 GoJet Airlines LLC d/b/a United Express 63412.0 318517 19.91
5 Hawaiian Airlines Inc. 50364.0 351251 14.34
6 Horizon Air 75818.0 530937 14.28
7 Mesa Airlines Inc. 163770.0 841086 19.47
8 Piedmont Airlines 78146.0 478677 16.33
9 Spirit Air Lines 200458.0 985471 20.34
In [52]:
SUM = df_fp.DELAYED_FLIGHTS.sum()#The total flights count match with the above count
print('Total number of delayed flights:', SUM)
Total number of delayed flights: 1143758.0
In [53]:
SUM1= df_fp.PERCENTAGE_DELAYED.sum()# Here we can see that the total percentage is going above 100% the reason could be that most airlines have more percentage delay than the overall percentage delay i.e. greater than 19%
print('Total percentage of delayed flights:', SUM1)
Total percentage of delayed flights: 191.29999999999998
In [54]:
# Plot with the 19.14% as threshold value
plt.figure(figsize=(20, 10))
df_fp.groupby('Description').PERCENTAGE_DELAYED.sum().sort_values(ascending=False).plot.bar(fontsize=14)
plt.hlines(y=19.14, xmin=-1, xmax=18, colors='r', linestyles='solid', label='cutoff')
plt.title('PERCENTAGE OF DELAYED FLIGHTS BY AIRLINE', fontsize=20)
plt.xlabel('AIRLINE', fontsize=16)
plt.ylabel('PERCENTAGE', fontsize=16)
plt.rc('xtick',labelsize=14)
plt.rc('ytick',labelsize=14)
plt.show()

--The top 5 Airlines by percentage has now changed we can see that Commute Air and GoJet Airlines were not part of our top 5 list earlier but in terms of delay percentage they are in the top 5. --Frontier airlines have arrived around 74.36% on time which is the lowest and the highest here is Horizon Air which is 85.72%, this is interesting as Horizon Air is 6th in the number of flights list and still has managed to maintain lower arrival delay times. --Frontier Airlines was 3rd in the number of flights but has the highest number of flight delays amongst them all. --Another point to note is for Commute Air which was 8th in the list of number of flights but is ranking 2nd in terms of delay

In [56]:
plt.figure(figsize=(20, 10))
small_airlines.groupby('Description').ArrDelay.sum().sort_values(ascending=False).plot.bar()
plt.title('TOTAL DELAYED MINUTES BY AIRLINE', fontsize=18)
plt.xlabel('AIRLINE', fontsize=14)
plt.ylabel('TOTAL DELAYED MINUTES', fontsize=14)
plt.rc('xtick',labelsize=10)
plt.rc('ytick',labelsize=10)
plt.show()

Here Alaska Airlines which was among the top 5 Airlines stands out as it has lower Arrival delay minutes as compared to the other airlines while Mesa Airlines tops the list

In [ ]:
small_airlines.head(2)
Out[ ]:
year month DayOfWeek flightdate Operating_Airline OriginCityName Dest Origin DestCityName DestState ... Tail_Number DOT_ID_Operating_Airline Flight_Number_Operating_Airline OriginStateName DestStateName ArrTimeBlk CancellationCode FirstDepTime TotalAddGTime LongestAddGTime
0 2022 2 2.0 2022-02-01 F9 Las Vegas, NV SAN LAS San Diego, CA CA ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2022 2 2.0 2022-02-01 F9 San Diego, CA LAS SAN Las Vegas, NV NV ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

2 rows × 45 columns

In [ ]:
#Calculating the Average Arrival Delay
test_3 = small_airlines[['Description', 'ArrDelay']]
test_3 = test_3.groupby(['Description']).mean().reset_index()
test_3.rename({'ArrDelay': 'AVG_DELAY'}, axis=1, inplace=True)
test_3 = test_3.sort_values(['AVG_DELAY']).reset_index(drop=True)
test_3.head(18)
Out[ ]:
Description AVG_DELAY
0 Alaska Airlines Inc. 0.363510
1 Horizon Air 1.024617
2 Piedmont Airlines 1.706491
3 Hawaiian Airlines Inc. 2.507848
4 Air Wisconsin Airlines Corp 5.381155
5 Spirit Air Lines 6.429750
6 GoJet Airlines LLC d/b/a United Express 7.030058
7 Mesa Airlines Inc. 8.140483
8 Frontier Airlines Inc. 9.971267
9 CommuteAir LLC dba CommuteAir 14.626418

We can see that Alaska Airlines is lower in terms of Average Arrival delay but, what catches the attention is Commute Air which was 4th in the list by Total Delayed minutes is ranking higher in the list of Average Arrival delay

In [ ]:
#Plot shows the averages that we calculate above
fig = plt.figure(1, figsize=(15,9))
sns.set(font_scale=1.6)
ax = sns.barplot(x='AVG_DELAY', y='Description', data=test_3)
ax.set_title('AVERAGE ARRIVAL DELAY BY AIRLINE (mins)', fontsize=18)
ax.set_xlabel('AVERAGE DELAY BY AIRLINE (min)', fontsize=14)
plt.show()
In [ ]:
#Top 20 destinations where these airlines travel the most
plt.figure(figsize=(20, 10))
sns.set(font_scale=1.6)
axis = sns.countplot(x=small_airlines['DestCityName'], data=small_airlines, order=small_airlines['DestCityName'].value_counts().iloc[0:20].index)
axis.set_xticklabels(axis.get_xticklabels(), rotation=90, ha='right')
plt.title('Top 20 Destination Cities', fontsize=20)
plt.xlabel('20 Top Destination Cities (DestCityName)', fontsize=14)
plt.ylabel('Total Number of Flights', fontsize=14)
plt.tight_layout()
plt.show()
In [61]:
#Top 5 Airports that have the most Arrival Delay
plt.figure(figsize=(20, 10))
small_airlines.groupby('Dest').ArrDelay.sum().sort_values(ascending=False).plot.bar()
axis = sns.countplot(x=small_airlines['Dest'], data=small_airlines, order=small_airlines['Dest'].value_counts().iloc[0:5].index)
plt.title('TOTAL DELAYED MINUTES BY AIRLINE', fontsize=18)
plt.xlabel('DESTINATION AIRPORT', fontsize=14)
plt.ylabel('TOTAL DELAYED MINUTES', fontsize=14)
plt.rc('xtick',labelsize=10)
plt.rc('ytick',labelsize=10)
plt.show()

We can see that the top 5 Airports where delays have been observed with respect to Arrival are:

  1. SEA: Seattle, WA
  2. ORD: Chicago, IL
  3. IAH: Houston,TX
  4. DFW: Dallas/Fort Worth, TX
  5. IAD: Washington, DC
In [66]:
#Worse and Best months to travel based on flights delay
plt.figure(figsize=(20, 10))
small_airlines.groupby('month').ArrDelay.sum().sort_values(ascending=False).plot.bar()
plt.title('Number of Delayed Flights per Month', fontsize=18)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Number of Flights', fontsize=14)
plt.show()

We can see that the best months to travel or most travel is during the summer months that is June to August. Followed by February and December which are holiday seasons The worst month is September which could be due to start of the Fall season when the weather is changing and that could cause delays

In [68]:
#Day of Week vs Arrival Delay to check is there a specific day of the week when Arrival delays are high
plt.figure(figsize=(20, 10))
barplot =small_airlines.groupby('DayOfWeek').ArrDel15.sum().sort_values(ascending=False).plot.bar()
plt.title('Number of Delayed Flights per day of the week', fontsize=18)
plt.xlabel('Day of the week', fontsize=14)
plt.ylabel('Number of Flights', fontsize=14)
plt.show()

We can see that Friday's have the highest number of flight delays while Saturday's have the lowest number of flight delays, which could be due to more number of people travelling after work on Friday's for weekend trips. The 2nd highest is on Monday which could be because its the start of the week and people need to rejoin their work

In [70]:
#Day of Month vs Arrival Delay to check is there a specific day of the week when Arrival delays are high
plt.figure(figsize=(20, 10))
barplot =small_airlines.groupby('DayofMonth').ArrDel15.sum().sort_values(ascending=False).plot.bar()
plt.title('Number of Delayed Flights per day of the month', fontsize=18)
plt.xlabel('Day of the month', fontsize=14)
plt.ylabel('Number of Flights', fontsize=14)
plt.show()
In [74]:
#Analyze Departure vs Arrival Delays
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['hatch.linewidth'] = 1.8

fig = plt.figure(1, figsize=(15,9))
ax = sns.barplot(x="DepDelay", y="Description", data=small_airlines, color="green", ci=None)
ax = sns.barplot(x="ArrDelay", y="Description", data=small_airlines, color="r", hatch = '///', alpha = 0.0, ci=None)
ax.yaxis.label.set_visible(False)
plt.title('Departure vs Arrival Delay')
plt.xlabel('Mean delay (min)', fontsize=14, labelpad=10);
fig
Out[74]:

Based on the above analysis it is clear that Arrival delays are lower for all the airlines while the departure delays are higher for all of them. The highest departure delay is for Commute Air, followed by Frontier Airlines and GoJet Airlines which shows that airlines try their best to overcome the departure delays by trying to reduce their causes of Arrival delays and reaching the destination airports on time. But this is only based on the factors that they can control.

In [ ]:
#Most popular Destination Airport with Average Arrival Delays
In [83]:
t4 = small_airlines[['Dest', 'ArrDelay']]
t4 = t4.groupby(['Dest']).mean().reset_index()
t4.head(10)
Out[83]:
Dest ArrDelay
0 ABE 2.193191
1 ABQ 6.713205
2 ACK 10.526746
3 ACY 5.982557
4 ADK -0.005871
5 ADQ -0.671023
6 AEX 14.659603
7 AGS 3.930588
8 AKN 2.007199
9 ALB 9.801680
In [84]:
t5 = small_airlines.Dest.value_counts().rename_axis('Dest').reset_index(name='FLIGHTS').iloc[0:30]
t5.head(30)
Out[84]:
Dest FLIGHTS
0 SEA 495093
1 ORD 279372
2 IAH 242419
3 DFW 199013
4 IAD 187601
5 PHL 186390
6 LAS 184502
7 MCO 174800
8 PDX 168150
9 DEN 166584
10 HNL 158547
11 LAX 133642
12 EWR 123151
13 CLT 118138
14 PHX 115202
15 FLL 108796
16 DTW 100531
17 SFO 94607
18 ATL 84328
19 OGG 80517
20 ANC 75107
21 SAN 66275
22 TPA 54332
23 BWI 46093
24 KOA 44889
25 CLE 43445
26 LIH 42771
27 BOS 41619
28 GEG 41559
29 SJC 36654
In [88]:
t4_5 = pd.merge(t5, t4, on='Dest')
t4_5.head()
Out[88]:
Dest FLIGHTS ArrDelay
0 SEA 495093 0.949551
1 ORD 279372 6.321399
2 IAH 242419 4.903065
3 DFW 199013 10.597730
4 IAD 187601 6.041220
In [89]:
t4_5 = t4_5.set_index('Dest')

fig = plt.figure(figsize=(15,7)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as a
width = .3

t4_5.FLIGHTS.plot(kind='bar',color='grey',ax=ax, width=width, position=0)
t4_5.ArrDelay.plot(kind='bar',color='blue', ax=ax2, width=width, position=1)

plt.hlines(y=4.90, xmin=-1, xmax=30, colors='b', linestyles='dashed', label='cutoff')

#ax.grid(None, axis=1)
ax2.grid(None)

ax.set_title('MOST POPULAR AIRPORTS vs AVERAGE ARRIVAL DELAY', fontsize=18)
ax.set_ylabel('NUMBER OF FLIGHTS', fontsize=16)
ax2.set_ylabel('AVERAGE ARRIVAL DELAY (mins)')
ax.set_xlabel('DESTINATION AIRPORTS', fontsize=16)

plt.show()

We conclude the Exploratory Data Analysis here the next part we start working with the model and decide which columns to be used which will be covered in another python notebook